# Ryan Copus, Ryan Hübert and Paige Pellaton
# "Trading Diversity? Judicial Diversity and Case Outcomes in Federal Courts"
# American Political Science Review

# File name: chp_apsr_02_plaintiffs2.R
# Last revision date: May 30, 2024
# Questions or comments? Contact Ryan Hübert: https://ryanhubert.com/

# What does this script do?
# This script takes a dataframe of human plaintiff names for each case in the 
# dataset and predicts their genders and races/ethnicities. 

# Last pre-print execution of this code: 
# > Date: May 1, 2024 
# > Machine: MacBook Pro 14" (2021 model) with Apple M1 Max chip and 64 GB RAM
# > OS: macOS Sonoma 14.4
# > R: version 4.3.2

################################################################################
# IMPORTANT NOTE ABOUT DATA LIMITATIONS
################################################################################

# In the publicly available version of the replication files, all plaintiff
# names have been redacted. As a result, execution of this script will not be 
# successful. Please contact the author(s) for more information.

################################################################################
# Before you run this script...
################################################################################

# Clear the workspace so results do not depend on objects left over from an
# earlier session. Note: this removes every object in the calling environment.
rm(list = ls())

# You must create a U.S. Census API key, place the key in a text file named
# `census_key.txt`, and save that file in the `outs` subdirectory of the main
# replication directory.
# Sign up for the key here: https://api.census.gov/data/key_signup.html

################################################################################
# Load packages and set options
################################################################################

# Attach with library() rather than require(): require() only warns and returns
# FALSE when a package is missing, which would let the script continue and fail
# much later with a less informative error.
library(tidyverse)
library(gender) # for gender coding
library(tidycensus) # for wru race coding
library(wru) # for wru race coding
library(predictrace) # for predictrace race coding

# fastDummies is only called through `fastDummies::` near the end of the
# script; check for it now so a missing package is reported before any of the
# slow prediction steps are run.
if (!requireNamespace("fastDummies", quietly = TRUE)) {
  stop("Package 'fastDummies' is required by this script but is not installed.")
}

# Turn off messages from summarize function
options(dplyr.summarise.inform = FALSE)

################################################################################
# Directory management
################################################################################

# Define and set the working directory: the main replication directory, i.e.
# the current directory with a trailing `/code` (or `/Code`) folder removed.
# The pattern is anchored to the end of the path and applied once, so only a
# final `code` folder is stripped; an unanchored global replacement would also
# mangle paths that merely contain `/code` somewhere (e.g. `/home/u/codebase`).
wdir <- sub("/[Cc]ode/?$", "", getwd())
setwd(wdir)

################################################################################
# Load and perform minor cleaning of plaintiff name dataset
################################################################################

# Case data: keep the cases in the main dataset that are not flagged to be
# dropped, and retain only the two case identifiers
df <- read_csv(paste0(wdir,"/data/chp_apsr_case_data.csv"), show_col_types = FALSE) %>%
  filter(is.na(to_drop) & df_main == 1) %>%
  select(CASE_ID, OPEN_ID) %>%
  distinct()

# Plaintiff names: keep only plaintiffs from the retained cases, and only those
# with at least one of a first, middle or last name
pf <- read_csv(paste0(wdir, "/outs/chp_apsr_plaintiffs.csv"), show_col_types = FALSE) %>%
  filter(CASE_ID %in% df$CASE_ID) %>%
  distinct() %>%
  filter(!(is.na(fn) & is.na(mn) & is.na(ln)))

# Drop any party that appears with more than one name (more than one row per
# file / case / party id)
names_per_party <- pf %>%
  group_by(file, CASE_ID, pid) %>%
  count()
multi_name <- names_per_party[names_per_party$n > 1, ]
pct_dropped <- round(nrow(multi_name) / nrow(names_per_party), 3) * 100
print(paste0("Dropping ", pct_dropped, "% of parties in dataset because they have multiple names."))

# Joining the multi-name parties back in marks them with a non-missing `n`;
# keeping rows where `n` is missing removes them. The (all-missing) column `n`
# stays in `pf` afterwards.
pf <- pf %>%
  left_join(multi_name, by = c("file", "CASE_ID", "pid")) %>%
  filter(is.na(n))
rm(names_per_party, multi_name, pct_dropped)

# Clean up county codes
# The first two digits of `COUNTY_CODE` are matched to a state through the
# `fips_codes` lookup table; the last three digits are kept as the county code.
fips2 <- distinct(mutate(fips_codes[,c("state","state_code")], state = str_to_lower(state)))
pf$state_code <- str_extract(pf$COUNTY_CODE,"^([0-9]{2})")
pf <- left_join(pf, fips2, by = join_by(state_code))
pf$COUNTY_CODE[which(pf$state=="vi")] <- NA ## Can't get census data for Virgin Islands 
pf$COUNTY_CODE <- str_extract(pf$COUNTY_CODE,"([0-9]{3})$")

# Make state names uppercase
# This has to happen BEFORE the Miami-Dade recode below: that recode compares
# against the uppercase abbreviation "FL", so while `state` was still lowercase
# the comparison never matched and the recode was silently skipped.
pf$state <- str_to_upper(pf$state)

# Change Miami-Dade County FIPS code (025 -> 086 within Florida)
# which() drops rows where the state or county is missing
pf$COUNTY_CODE[which(pf$state=="FL" & pf$COUNTY_CODE=="025")] <- "086"

# Which Census year to use for the wru coding?
# The two-digit number between hyphens in CASE_ID is mapped to a Census year:
# 90-99 and 00-14 -> 2010, 15-24 -> 2020. Any other two-digit value is left
# as is (and then converted to a number), and a CASE_ID without such a number
# gives NA.
pf$chunk <- str_extract(pf$CASE_ID, "[-]([0-9][0-9])[-]", group = 1)
pf$chunk[grepl("^(9[0-9]|0[0-9]|1[0-4])$", pf$chunk)] <- "2010"
pf$chunk[grepl("^(1[5-9]|2[0-4])$", pf$chunk)] <- "2020"
pf$chunk <- as.numeric(pf$chunk)

# Make a unique ID for every case-plaintiff pair
# (CASE_ID, a hyphen, then the party id left-padded with zeros to 4 characters)
pf <- arrange(pf, CASE_ID, pid)
pf$id <- paste0(pf$CASE_ID,"-",str_pad(pf$pid, 4, "left", "0"))
# Largest number of rows sharing one id -- 1 means the id is unique
max(count(pf,id)$n)

################################################################################
# Predict plaintiffs' genders
################################################################################

lgen <- tibble(pf)

# Helper: code gender from first and middle names with one method of the
# `gender()` function and append the result to `people`.
#
# Arguments:
#   people  data frame with (at least) the name columns `fn` and `mn`
#   method  method name passed on to `gender()` ("ssa" or "ipums" below)
# Returns:
#   `people` with two extra columns, `gender_<method>` and `pred.male_<method>`.
#   The first-name prediction is used when there is one; otherwise the
#   middle-name prediction is used.
# Side effect:
#   prints the distribution of the predicted gender (counts and percentages)
#   among rows that have a first or a middle name.
add_gender_prediction <- function(people, method) {
  # Look up a vector of names; the result has one row per distinct
  # name / probability / gender combination, with columns named after `key`
  lookup_names <- function(name_values, key) {
    res <- gender(names = name_values, method = method)
    res <- distinct(res[, c("name", "proportion_male", "gender")])
    colnames(res) <- c(key, paste0(".pred.male_", key), paste0(".gender_", key))
    res
  }

  # Merge the first- and middle-name results into the main dataframe
  people <- left_join(people, lookup_names(people$fn, "fn"), by = "fn")
  people <- left_join(people, lookup_names(people$mn, "mn"), by = "mn")

  # If we predict gender with first name, use that; otherwise use the
  # prediction for the middle name
  gender_col <- paste0("gender_", method)
  prob_col <- paste0("pred.male_", method)
  people[[gender_col]] <- coalesce(people[[".gender_fn"]], people[[".gender_mn"]])
  people[[prob_col]] <- coalesce(people[[".pred.male_fn"]], people[[".pred.male_mn"]])

  # For all plaintiffs where we have either a first or middle name, look at stats
  name_mask <- (!is.na(people$mn) | !is.na(people$fn))
  print(table(people[[gender_col]][name_mask], exclude = NULL))
  print(round(prop.table(table(people[[gender_col]][name_mask], exclude = NULL)), 3) * 100)

  # Drop only the temporary lookup columns, by name. This replaces a selection
  # of "the first 12 columns", which silently depended on the exact number of
  # columns in the plaintiff data.
  helper_cols <- c(".pred.male_fn", ".gender_fn", ".pred.male_mn", ".gender_mn")
  people[, setdiff(colnames(people), helper_cols)]
}

# Method 1: predict gender using historical data (SSA method)
lgen <- add_gender_prediction(lgen, "ssa")

# Method 2: predict gender using the Integrated Public Use Microdata Series (ipums method)
lgen <- add_gender_prediction(lgen, "ipums")

# Intercoder reliability check for gender methods (SSA and IPUMS)
# Two codings agree when they are equal or both missing. This is computed row
# by row rather than from the diagonal of a cross-tabulation, because that
# diagonal only lines up when both codings contain exactly the same categories.
same_label <- (lgen$gender_ssa == lgen$gender_ipums) %in% TRUE
both_missing <- is.na(lgen$gender_ssa) & is.na(lgen$gender_ipums)
print(paste0("Agreement for ",round(mean(same_label | both_missing),2)*100,"% of plaintiffs."))
rm(same_label, both_missing)

# Create "gender" category
# 1. For which plaintiffs do the raters (ssa vs. ipums) disagree?
lgen[which(lgen$gender_ssa != lgen$gender_ipums), c("fn","mn","ln","gender_ssa","gender_ipums") ]
# Looks like gender_ssa is more likely to be accurate from visual inspection

# 2. Make an overall "gender" column such that if gender_ssa is missing, we use gender_ipums, else gender_ssa
lgen$gender <- coalesce(lgen$gender_ssa, lgen$gender_ipums)
lgen$pred.male <- coalesce(lgen$pred.male_ssa, lgen$pred.male_ipums)

lgen <- distinct(lgen[,c("CASE_ID", "pid", "gender", "pred.male")])
rm(add_gender_prediction)

################################################################################
# Predict plaintiffs' races and/or ethnicities (using wru)
################################################################################

# Load your Census API key -- see instructions at the top of this script
key_path <- paste0(wdir, "/outs/census_key.txt")
if (!file.exists(key_path)) {
  stop("Did you create and save a U.S. Census API key according to the instructions at the top of this R script?")
}

# readLines() returns one element per line, so a key file that ends with a
# blank line would give a vector of length > 1. Keep the first non-empty line
# so that `my_key` is always a single string.
key_lines <- str_trim(readLines(key_path, warn = FALSE))
my_key <- key_lines[nzchar(key_lines)][1]
if (is.na(my_key)) {
  stop("The file `census_key.txt` in the `outs` subdirectory exists but does not contain a U.S. Census API key.")
}
rm(key_path, key_lines)

## States that appear in our dataset, leaving out the territories for which
## Census data cannot be loaded (Virgin Islands, Guam, Puerto Rico)
allstates <- pf$state[!is.na(pf$state) & !pf$state %in% c("VI","GU","PR")]
allstates <- str_to_upper(sort(unique(allstates)))

# The plaintiff data with the column names handed to wru (surname, county)
wru_input <- rename(pf, surname = ln, county = COUNTY_CODE)

# Rows that can be coded at all (a last name and a state with Census data),
# and rows for which a county code is available
codable <- !is.na(pf$ln) & (pf$state %in% allstates)
known_county <- !is.na(pf$COUNTY_CODE)

lwru <- NULL
for (chunk in unique(pf$chunk)) {
  # chunk <- 2010

  # Load county-level census data for this Census year
  census_data <- get_census_data(key = my_key, states = allstates, year = chunk, census.geo = "county")

  # Plaintiffs WITH county codes: predict race from surname and county
  vmask <- pf$chunk == chunk & codable & known_county
  if (any(vmask)) {
    with_county <- wru::predict_race(voter.file = wru_input[vmask, ],
                                     census.surname = TRUE,
                                     surname.only = FALSE,
                                     census.geo = "county",
                                     impute.missing = TRUE,
                                     year = chunk,
                                     skip_bad_geos = TRUE,
                                     census.data = census_data)
    lwru <- distinct(bind_rows(lwru, with_county))
    rm(with_county)
  }

  # Plaintiffs WITHOUT county codes: predict race from surname only
  # NOTE(review): no `year` is passed in this call, so wru's default applies --
  # confirm this is intended for the 2010 chunk.
  vmask <- pf$chunk == chunk & codable & !known_county
  if (any(vmask)) {
    surname_only <- wru::predict_race(voter.file = wru_input[vmask, ],
                                      census.surname = TRUE,
                                      surname.only = TRUE,
                                      impute.missing = TRUE,
                                      census.data = census_data)
    lwru <- distinct(bind_rows(lwru, surname_only))
    rm(surname_only)
  }
}
rm(wru_input, codable, known_county)

# Clean for merging
# Step 1: reshape the five wru probabilities to long form (one row per party
# and category) and keep, for each party, the category with the highest
# probability. `key` holds the probability, `race_wru` the category.
tmp <- lwru[,c("file","CASE_ID","pid","pred.whi", "pred.bla", "pred.his", "pred.asi", "pred.oth")] %>%
  gather("race_wru", "key", c("pred.whi", "pred.bla", "pred.his", "pred.asi", "pred.oth")) %>%
  group_by(file, CASE_ID, pid) %>%
  mutate(max_pred = max(key)) %>%
  filter(key == max_pred)
tmp <- distinct(tmp[, c("file","CASE_ID","pid","key","race_wru")])
tmp$race_wru <- sub("pred[.]", "", tmp$race_wru)

# Step 2: share of parties whose best category has probability below 0.5;
# those parties are labelled as having no dominant prediction
sum(tmp$key < 0.5) / nrow(tmp)
tmp$race_wru <- replace(tmp$race_wru,
                        which(tmp$key < 0.5 & !is.na(tmp$race_wru)),
                        "no dominant prediction")

# Step 3: attach the classification to the predictions, then expand to every
# plaintiff in `pf` (plaintiffs without a prediction get missing values)
lwru <- left_join(lwru, tmp)
lwru <- distinct(lwru[, c("file","CASE_ID","pid","pred.whi", "pred.bla", "pred.his", "pred.asi", "pred.oth","race_wru")])
lwru <- distinct(left_join(pf[, c("file","CASE_ID","pid")], lwru))
lwru$name_used <- "last"
rm(tmp)

# Check that there are no duplicates -- should print a 1
print(max(count(group_by(lwru, file, CASE_ID, pid))$n))

# Clean up race names: expand the three-letter codes, one substitution at a
# time and in this order
race_labels <- c(his = "hispanic", bla = "black", whi = "white", asi = "asian", oth = "other")
for (abbr in names(race_labels)) {
  lwru$race_wru <- gsub(abbr, race_labels[[abbr]], lwru$race_wru)
}
rm(race_labels, abbr)

lwru <- distinct(lwru[, c("CASE_ID", "pid", "race_wru", "name_used", "pred.whi", "pred.bla", "pred.his", "pred.asi", "pred.oth")])

## Create a distribution of the wru plaintiff classifications (Table A.1 in the SI)

# Format a group's mean probability with exactly three decimals. Formatting the
# rounded number directly (instead of right-padding its printed form with
# zeros) keeps values that round to a whole number correct: a mean that rounds
# to 0 is shown as "0.000" rather than "00000", and 1 as "1.000" rather than
# "10000".
fmt_mean_prob <- function(p) formatC(round(mean(p), 3), format = "f", digits = 3)

# One row per classification: its share of all plaintiffs and the mean
# predicted probability of each category within that classification
wrudist <- summarise(group_by(lwru, race_wru), prop = n()/nrow(lwru), 
                `Mean Pr(White)` = fmt_mean_prob(pred.whi), 
                `Mean Pr(Black)` = fmt_mean_prob(pred.bla), 
                `Mean Pr(Hispanic)` = fmt_mean_prob(pred.his), 
                `Mean Pr(Asian)` = fmt_mean_prob(pred.asi), 
                `Mean Pr(Other)` = fmt_mean_prob(pred.oth))

# Order rows: classified categories first (largest share first), then the
# "no dominant prediction" and unclassified rows
wrudist <- bind_rows(arrange(wrudist[!(is.na(wrudist$race_wru) | grepl("no dom",wrudist$race_wru)),], desc(prop)), 
                arrange(wrudist[is.na(wrudist$race_wru) | grepl("no dom",wrudist$race_wru),], desc(prop)))

# Unclassified plaintiffs have no probabilities to report
wrudist[is.na(wrudist$race_wru),3:ncol(wrudist)] <- "---"
wrudist[is.na(wrudist$race_wru),1] <- "None"

# Row label: category name plus its share as a whole percentage, rounded down.
# The inner round(..., 1) removes floating-point noise before floor(): without
# it, e.g. 100 * 0.29 evaluates to 28.999999999999996 and would be shown as 28.
wrudist$race_wru <- paste0(str_to_title(wrudist$race_wru)," (",floor(round(100*round(wrudist$prop,3),1)),"\\%)")
wrudist$prop <- NULL
colnames(wrudist)[1] <- "Classified Plaintiff Race"
print(wrudist)
write_csv(wrudist, paste0(wdir,"/outs/chp_apsr_wru_distribution.csv"))
rm(wrudist, fmt_mean_prob)


################################################################################
# Predict plaintiffs' races and/or ethnicities (using predictrace)
################################################################################

# Predict race of first names
# The prediction output is bound column-wise to the plaintiff identifiers, so
# this relies on predictrace returning one row per input name, in input order
# (TODO confirm against the predictrace documentation).
lpr1 <- bind_cols(pf[,c("CASE_ID","pid","id")], predictrace::predict_race(pf$fn, probability = TRUE, surname = FALSE))
lpr1 <- rename(lpr1, race_pr = likely_race)
# Rename the `probability_*` columns to `pred.*`
colnames(lpr1) <- gsub("probability_","pred.", colnames(lpr1))

# Predict race of surnames (same steps as for first names)
lpr2 <- bind_cols(pf[,c("CASE_ID","pid","id")], predictrace::predict_race(pf$ln, probability = TRUE, surname = TRUE))
lpr2 <- rename(lpr2, race_pr = likely_race)
colnames(lpr2) <- gsub("probability_","pred.",colnames(lpr2))

# More missingness in first_name than last_name, so impute as needed
# Number of plaintiffs without a first-name / last-name prediction
nrow(lpr1[is.na(lpr1$race_pr),])
nrow(lpr2[is.na(lpr2$race_pr),])

# Use the surname prediction whenever there is one; fall back on the
# first-name prediction only for plaintiffs without a surname prediction
lpr2 <- mutate(lpr2[!is.na(lpr2$race_pr),], name_used = "last")
lpr1 <- mutate(lpr1[!is.na(lpr1$race_pr) & !lpr1$id %in% lpr2$id,], name_used = "first")
lpr <- bind_rows(lpr1,lpr2)
# Keep the identifiers plus every column from `race_pr` to the last column
# (a positional selection, so it depends on the column order produced above),
# and expand to all plaintiffs in `pf`
lpr <- left_join(pf[,c("CASE_ID","pid")], lpr[,c("CASE_ID","pid", colnames(lpr)[which(grepl("(race_pr)", colnames(lpr))):length(colnames(lpr))])])

# No dominant predictions
# Reshape the six probabilities to long form and keep each plaintiff's largest
# one (`key`); ties keep more than one row per plaintiff
tmp <- gather(lpr[,c("CASE_ID","pid","pred.american_indian", "pred.asian", "pred.black", "pred.hispanic", "pred.white", "pred.2races")], 
              "race_pr", "key", c("pred.american_indian", "pred.asian", "pred.black", "pred.hispanic", "pred.white", "pred.2races"))
tmp <- filter(group_by(tmp, CASE_ID, pid), key == max(key))
# Attach each plaintiff's sum of the six probabilities. The sum is bound as an
# unnamed third column, which bind_cols() names `...3`.
tmp <- left_join(bind_cols(lpr[,c("CASE_ID","pid"),],
                           rowSums(lpr[,c("pred.american_indian", "pred.asian", "pred.black", "pred.hispanic", "pred.white", "pred.2races")])),
                 tmp)
# Express the largest probability as a share of the plaintiff's total
tmp$key <- tmp$key/tmp$...3

# Plaintiffs whose largest share is below one half get no dominant prediction
lpr <- left_join(lpr, tmp[,c("CASE_ID","pid","key")])
lpr$race_pr[lpr$key<0.5 & !is.na(lpr$race_pr)] <- "no dominant prediction"

# Clean up race names
# Any label containing a comma is recoded to "2races" (presumably these are
# ties listing several races -- confirm against predictrace output)
lpr$race_pr[grepl(",",lpr$race_pr)] <- "2races"

# Keep identifiers, the classification, the name used and all `pred.*` columns
lpr <- distinct(tibble(lpr[,c("CASE_ID", "pid", "race_pr", "name_used", colnames(lpr)[grepl("pred[.]",colnames(lpr))])]))

# Intercoder reliability check for race methods (predictrace, wru)
# Pair the two classifications for every plaintiff
icr_check <- full_join(lwru[,c("CASE_ID","pid","race_wru")], lpr[,c("CASE_ID","pid","race_pr")])
icr_check <- icr_check[,c("race_wru","race_pr")]
# Any predictrace label that never occurs among the wru labels is folded into
# "other", so that both classifications use the same set of labels
icr_check$race_pr[!(icr_check$race_pr %in% unique(icr_check$race_wru))] <- "other"

# Share of plaintiffs in each (race_wru, race_pr) combination
ctab1 <- count(icr_check, race_wru, race_pr)
ctab1$p <- ctab1$n/sum(ctab1$n)

races <- c("asian","black","hispanic","white","other")

# A classification counts as missing when it is NA or "no dominant prediction"
wru_missing <- is.na(ctab1$race_wru) | grepl("no dominant", ctab1$race_wru)
pr_missing <- is.na(ctab1$race_pr) | grepl("no dominant", ctab1$race_pr)

mask <- (ctab1$race_wru==ctab1$race_pr) & (ctab1$race_wru %in% races)
print(paste0("Agreement for ",round(sum(ctab1$p[which(mask)])*100,1),"% of plaintiffs."))

# Disagreement requires a substantive classification from BOTH methods.
# Without the condition on race_pr, plaintiffs with "no dominant prediction"
# from predictrace were counted here as well as under "missing for race_pr"
# below, so the five shares did not add up to 100%.
mask <- (ctab1$race_wru!=ctab1$race_pr) & (ctab1$race_wru %in% races) & (ctab1$race_pr %in% races)
print(paste0("Disagreement for ",round(sum(ctab1$p[which(mask)])*100,1),"% of plaintiffs."))

mask <- wru_missing & !pr_missing
print(paste0("Missing for race_wru and not for race_pr: ",round(sum(ctab1$p[which(mask)])*100,1),"%."))
mask <- !wru_missing & pr_missing
print(paste0("Missing for race_pr and not for race_wru: ",round(sum(ctab1$p[which(mask)])*100,1),"%."))
# Missing in both classifications (this share was previously printed under a
# copy of the label of the line above)
mask <- wru_missing & pr_missing
print(paste0("Missing for both race_wru and race_pr: ",round(sum(ctab1$p[which(mask)])*100,1),"%."))
rm(wru_missing, pr_missing)

################################################################################
# Clean up for use in analysis
################################################################################

# Merge everything together (still at the plaintiff level): one row per
# plaintiff with the gender coding and both race codings
xf <- pf[, c("file","CASE_ID", "pid")] %>%
  distinct() %>%
  left_join(lgen[, c("CASE_ID","pid","gender")], by = join_by(CASE_ID, pid)) %>%
  left_join(lwru[, c("CASE_ID","pid","race_wru")], by = join_by(CASE_ID, pid)) %>%
  left_join(lpr[, c("CASE_ID","pid","race_pr")], by = join_by(CASE_ID, pid))

# Which categories will we look at in the paper?
race_cats <- c("white", "black")

for (race_var in c("race_pr", "race_wru")) {
  coded <- xf[[race_var]]
  # Treat "no dominant prediction" like missing data
  coded[which(coded == "no dominant prediction")] <- NA
  # Every remaining category outside `race_cats` is coded as "other"
  coded[which(!is.na(coded) & !(coded %in% race_cats))] <- "other"
  xf[[race_var]] <- coded
}
rm(race_var, coded)

# Plaintiffs classified as neither male nor female will be treated as missing
xf$gender <- replace(xf$gender, which(!(xf$gender %in% c("male","female"))), NA)

# Build plaintiff-level indicator (dummy) columns for gender and both race
# codings; these are aggregated to the case level further below
lf <- xf %>%
  rename(race_w = race_wru, race_p = race_pr) %>%
  as_tibble()
lf <- lf[, c("CASE_ID", "pid", "gender", "race_w", "race_p")]
# With ignore_na = TRUE no indicator column is created for missing values
lf <- fastDummies::dummy_cols(lf, select_columns = c("gender", "race_w", "race_p"), ignore_na = TRUE)

# Drop the original categorical columns and the "other" indicators
lf <- select(lf, -race_w, -gender, -race_p, -race_w_other, -race_p_other)

# Non-white indicator for each coding: 1 when the white indicator is 0, 0 when
# it is any other non-missing value, missing when it is missing
for (src in c("w", "p")) {
  is_white <- lf[[paste0("race_", src, "_white")]]
  lf[[paste0("race_", src, "_nonwhite")]] <- ifelse(is.na(is_white), NA, ifelse(is_white == 0, 1, 0))
}
rm(src, is_white)

# Rename: `race_w_black` -> `black_w`, `gender_female` -> `female`, etc.
new_names <- colnames(lf)
new_names <- gsub("race_([pw])_([a-z]+)","\\2_\\1", new_names)
new_names <- gsub("(?:nonwhite|latino_asian)_([pw])_([a-z_]+)","\\2_\\1", new_names)
new_names <- gsub("gender_([a-z]+)","\\1", new_names)
colnames(lf) <- new_names
rm(new_names)

# Start the case level dataset: one row per case, sorted by CASE_ID
cf <- lf %>%
  distinct(CASE_ID) %>%
  arrange(CASE_ID)

# Make the case level dataset: for every identity indicator (all columns after
# CASE_ID and pid), add the share of a case's plaintiffs with that identity
for (v in tail(colnames(lf), -2)) {
  case_share <- lf %>%
    group_by(CASE_ID) %>%
    summarise(
      ## how many plaintiffs did we identify in each case
      p.ided = sum(!is.na(.data[[v]])) / n(),
      ## what % of the identified plaintiffs were this identity
      share = mean(.data[[v]], na.rm = TRUE)
    )

  ## only if we ided most of the plaintiffs (>90%); other cases are left
  ## missing by the join below
  case_share <- case_share[case_share$p.ided > 0.90, c("CASE_ID", "share")]
  colnames(case_share)[2] <- v

  cf <- left_join(cf, case_share)
}

# Column order: gender shares, then the wru-based and the predictrace-based
# race shares, each set sorted alphabetically
wru_cols <- sort(colnames(cf)[grepl("_w", colnames(cf))])
pr_cols <- sort(colnames(cf)[grepl("_p", colnames(cf))])
cf <- cf[, c("CASE_ID", "female", "male", wru_cols, pr_cols)]

# Attach the case-level shares to the case identifiers (CASE_ID, OPEN_ID),
# sorted by OPEN_ID
df <- df %>%
  distinct() %>%
  arrange(OPEN_ID) %>%
  left_join(cf, by = c("CASE_ID" = "CASE_ID")) %>%
  distinct()

# Prefix every column after the two identifiers with the lower-cased first
# three letters of "Plaintiff"
prefix <- tolower(str_extract("Plaintiff", "^[A-z]{3}"))
value_cols <- 3:ncol(df)
colnames(df)[value_cols] <- paste0(prefix, "_", colnames(df)[value_cols])

write_csv(df, paste0(wdir,"/outs/chp_apsr_plaintiff_identities.csv"))